From f315c78d2c0a32f333790f7ac689ee7b56f74c65 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=C3=98yvind=20Kol=C3=A5s?=
Date: Thu, 31 Aug 2017 23:07:34 +0200
Subject: [PATCH] babl: refactor matrix_mul_vector further towards SIMD

---
Review notes (text between the "---" marker above and the first diff
header is not part of the commit and is skipped when the patch is applied):

* universal_nonlinear_rgb_converter: the linearisation loop now indexes
  every channel per pixel.  As submitted it read the green sample
  (rgba_in[i*4+1]) for the blue channel and copied rgba_in[3], i.e. the
  first pixel's alpha, into the alpha of every output pixel.
* The new *_buf3 / *_buf4 helpers load the three inputs of a pixel into
  locals before storing any output, so the in-place calls made in this
  patch (v_in == v_out) are fine.  The buf4 variants additionally copy
  the fourth component from v_in to v_out.
* NOTE(review): the untouched context lines in
  universal_nonlinear_rgba_u8_float_converter write rgba_out with a
  stride of 3 (and assign index i*3+2 twice) while the buf4 call added
  below walks the same buffer with a stride of 4 -- needs a follow-up.
* NOTE(review): the helpers take "int samples" while the converters in
  babl-fish-path.c pass a "long"; confirm the narrowing is acceptable.
* Indentation and whitespace-only context lines were reconstructed from
  the hunk line counts; the post-image blob id on the first index line
  predates the indexing fix.  Use --ignore-whitespace if context fails
  to match.

 babl/babl-fish-path.c      |  63 +++++-------------
 babl/babl-fish-reference.c |   4 +-
 babl/babl-matrix.h         | 132 +++++++++++++++++++++++++++++++++----
 3 files changed, 137 insertions(+), 62 deletions(-)

diff --git a/babl/babl-fish-path.c b/babl/babl-fish-path.c
index f8b5d8d..1dd8516 100644
--- a/babl/babl-fish-path.c
+++ b/babl/babl-fish-path.c
@@ -454,9 +454,6 @@ universal_nonlinear_rgb_converter (const Babl *conversion,unsigned char *src_cha
   float (*from_linear_blue) (void *trc, float value);
 
   float * matrixf = conversion->conversion.data;
-  const float mat[9] = {matrixf[0], matrixf[1],matrixf[2],
-                        matrixf[3], matrixf[4],matrixf[5],
-                        matrixf[6], matrixf[7],matrixf[8]};
   int i;
   float *rgba_in = (void*)src_char;
   float *rgba_out = (void*)dst_char;
@@ -478,13 +475,16 @@ universal_nonlinear_rgb_converter (const Babl *conversion,unsigned char *src_cha
 
   for (i = 0; i < samples; i++)
   {
-    rgba_out[0]=to_linear_red(to_trc_red, rgba_in[0]);
-    rgba_out[1]=to_linear_green(to_trc_green, rgba_in[1]);
-    rgba_out[2]=to_linear_blue(to_trc_blue, rgba_in[2]);
-    rgba_out[3]=rgba_in[3];
+    rgba_out[i*4]  =to_linear_red(to_trc_red, rgba_in[i*4]);
+    rgba_out[i*4+1]=to_linear_green(to_trc_green, rgba_in[i*4+1]);
+    rgba_out[i*4+2]=to_linear_blue(to_trc_blue, rgba_in[i*4+2]);
+    rgba_out[i*4+3]=rgba_in[i*4+3];
+  }
 
-    babl_matrix_mul_vectorff (mat, rgba_out, rgba_out);
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
 
+  for (i = 0; i < samples; i++)
+  {
     rgba_out[0] = from_linear_red(from_trc_red, rgba_out[0]);
     rgba_out[1] = from_linear_green(from_trc_green, rgba_out[1]);
     rgba_out[2] = from_linear_blue(from_trc_blue, rgba_out[2]);
@@ -508,9 +508,6 @@ universal_nonlinear_rgb_linear_converter (const Babl *conversion,unsigned char *
   float (*to_linear_blue) (void *trc, float value);
 
   float * matrixf = conversion->conversion.data;
-  const float mat[9] = {matrixf[0], matrixf[1],matrixf[2],
-                        matrixf[3], matrixf[4],matrixf[5],
-                        matrixf[6], matrixf[7],matrixf[8]};
   int i;
   float *rgba_in = (void*)src_char;
   float *rgba_out = (void*)dst_char;
@@ -524,17 +521,15 @@ universal_nonlinear_rgb_linear_converter (const Babl *conversion,unsigned char *
 
   for (i = 0; i < samples; i++)
   {
-    rgba_out[0]=to_linear_red(to_trc_red, rgba_in[0]);
-    rgba_out[1]=to_linear_green(to_trc_green, rgba_in[1]);
-    rgba_out[2]=to_linear_blue(to_trc_blue, rgba_in[2]);
-    rgba_out[3]=rgba_in[3];
-
-    babl_matrix_mul_vectorff (mat, rgba_out, rgba_out);
-
+    rgba_out[i*4]=to_linear_red(to_trc_red, rgba_in[0]);
+    rgba_out[i*4+1]=to_linear_green(to_trc_green, rgba_in[1]);
+    rgba_out[i*4+2]=to_linear_blue(to_trc_blue, rgba_in[2]);
+    rgba_out[i*4+3]=rgba_in[3];
     rgba_in += 4;
-    rgba_out += 4;
   }
 
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
+
   return samples;
 }
 
@@ -559,10 +554,7 @@ universal_nonlinear_rgba_u8_converter (const Babl *conversion,unsigned char *src
     rgb[i*3+2]=in_trc_lut[rgba_in_u8[i*4+2]];
   }
 
-  for (i = 0; i < samples; i++)
-  {
-    babl_matrix_mul_vectorff (matrixf, &rgb[i*3], &rgb[i*3]);
-  }
+  babl_matrix_mul_vectorff_buf3 (matrixf, rgb, rgb, samples);
 
   {
     const Babl *from_trc_red = (void*)destination_space->space.trc[0];
@@ -601,8 +593,8 @@ universal_nonlinear_rgba_u8_float_converter (const Babl *conversion,unsigned cha
     rgba_out[i*3+1]=in_trc_lut[rgba_in_u8[i*4+1]];
     rgba_out[i*3+2]=in_trc_lut[rgba_in_u8[i*4+2]];
     rgba_out[i*3+2]=rgba_in_u8[i*4+3] / 255.0;
-    babl_matrix_mul_vectorff (matrixf, &rgba_out[i*4], &rgba_out[i*4]);
   }
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_out, rgba_out, samples);
 
   return samples;
 }
@@ -611,20 +603,10 @@ static inline long
 universal_rgba_converter (const Babl *conversion,unsigned char *src_char, unsigned char *dst_char, long samples)
 {
   float *matrixf = conversion->conversion.data;
-  float mat[9] = {matrixf[0], matrixf[1],matrixf[2],
-                  matrixf[3], matrixf[4],matrixf[5],
-                  matrixf[6], matrixf[7],matrixf[8]};
-  int i;
   float *rgba_in = (void*)src_char;
   float *rgba_out = (void*)dst_char;
 
-  for (i = 0; i < samples; i++)
-  {
-    babl_matrix_mul_vectorff (mat, rgba_in, rgba_out);
-    rgba_out[3] = rgba_in[3];
-    rgba_in += 4;
-    rgba_out += 4;
-  }
+  babl_matrix_mul_vectorff_buf4 (matrixf, rgba_in, rgba_out, samples);
 
   return samples;
 }
@@ -634,19 +616,10 @@ static inline long
 universal_rgb_converter (const Babl *conversion,unsigned char *src_char, unsigned char *dst_char, long samples)
 {
   float *matrixf = conversion->conversion.data;
-  float mat[9] = {matrixf[0], matrixf[1],matrixf[2],
-                  matrixf[3], matrixf[4],matrixf[5],
-                  matrixf[6], matrixf[7],matrixf[8]};
-  int i;
   float *rgb_in = (void*)src_char;
   float *rgb_out = (void*)dst_char;
 
-  for (i = 0; i < samples; i++)
-  {
-    babl_matrix_mul_vectorff (mat, rgb_in, rgb_out);
-    rgb_in += 3;
-    rgb_out += 3;
-  }
+  babl_matrix_mul_vectorff_buf3 (matrixf, rgb_in, rgb_out, samples);
 
   return samples;
 }
diff --git a/babl/babl-fish-reference.c b/babl/babl-fish-reference.c
index 8273da5..d97cca3 100644
--- a/babl/babl-fish-reference.c
+++ b/babl/babl-fish-reference.c
@@ -496,15 +496,13 @@ babl_fish_reference_process (const Babl *babl,
         ((babl->fish.destination)->format.space)))
     {
       double matrix[9];
-      int i;
       double *rgba = rgba_double_buf;
       babl_matrix_mul_matrix (
         (babl->fish.destination)->format.space->space.XYZtoRGB,
         (babl->fish.source)->format.space->space.RGBtoXYZ,
         matrix);
 
-      for (i = 0; i < n; i++)
-        babl_matrix_mul_vector (matrix, &rgba[i * 4], &rgba[i * 4]);
+      babl_matrix_mul_vector_buf4 (matrix, rgba, rgba, n);
     }
 
     {
diff --git a/babl/babl-matrix.h b/babl/babl-matrix.h
index 19707cb..d4e29d9 100644
--- a/babl/babl-matrix.h
+++ b/babl/babl-matrix.h
@@ -61,29 +61,133 @@ static inline void babl_matrix_invert (const double *in, double *out)
 
 static inline void babl_matrix_mul_vector (const double *mat, const double *v_in, double *v_out)
 {
-  double val[3]={v_in[0], v_in[1], v_in[2]};
-
-  v_out[0] = m(mat, 0, 0) * val[0] + m(mat, 0, 1) * val[1] + m(mat, 0, 2) * val[2];
-  v_out[1] = m(mat, 1, 0) * val[0] + m(mat, 1, 1) * val[1] + m(mat, 1, 2) * val[2];
-  v_out[2] = m(mat, 2, 0) * val[0] + m(mat, 2, 1) * val[1] + m(mat, 2, 2) * val[2];
+  double a = v_in[0], b = v_in[1], c = v_in[2];
+  double m_0_0 = m(mat, 0, 0);
+  double m_0_1 = m(mat, 0, 1);
+  double m_0_2 = m(mat, 0, 2);
+  double m_1_0 = m(mat, 1, 0);
+  double m_1_1 = m(mat, 1, 1);
+  double m_1_2 = m(mat, 1, 2);
+  double m_2_0 = m(mat, 2, 0);
+  double m_2_1 = m(mat, 2, 1);
+  double m_2_2 = m(mat, 2, 2);
+
+  v_out[0] = m_0_0 * a + m_0_1 * b + m_0_2 * c;
+  v_out[1] = m_1_0 * a + m_1_1 * b + m_1_2 * c;
+  v_out[2] = m_2_0 * a + m_2_1 * b + m_2_2 * c;
 }
 
 static inline void babl_matrix_mul_vectorf (const double *mat, const float *v_in, float *v_out)
 {
-  float val[3]={v_in[0], v_in[1], v_in[2]};
-
-  v_out[0] = m(mat, 0, 0) * val[0] + m(mat, 0, 1) * val[1] + m(mat, 0, 2) * val[2];
-  v_out[1] = m(mat, 1, 0) * val[0] + m(mat, 1, 1) * val[1] + m(mat, 1, 2) * val[2];
-  v_out[2] = m(mat, 2, 0) * val[0] + m(mat, 2, 1) * val[1] + m(mat, 2, 2) * val[2];
+  float a = v_in[0], b = v_in[1], c = v_in[2];
+  float m_0_0 = m(mat, 0, 0);
+  float m_0_1 = m(mat, 0, 1);
+  float m_0_2 = m(mat, 0, 2);
+  float m_1_0 = m(mat, 1, 0);
+  float m_1_1 = m(mat, 1, 1);
+  float m_1_2 = m(mat, 1, 2);
+  float m_2_0 = m(mat, 2, 0);
+  float m_2_1 = m(mat, 2, 1);
+  float m_2_2 = m(mat, 2, 2);
+
+  v_out[0] = m_0_0 * a + m_0_1 * b + m_0_2 * c;
+  v_out[1] = m_1_0 * a + m_1_1 * b + m_1_2 * c;
+  v_out[2] = m_2_0 * a + m_2_1 * b + m_2_2 * c;
 }
 
 static inline void babl_matrix_mul_vectorff (const float *mat, const float *v_in, float *v_out)
 {
-  float val[3]={v_in[0], v_in[1], v_in[2]};
+  float a = v_in[0], b = v_in[1], c = v_in[2];
+  float m_0_0 = m(mat, 0, 0);
+  float m_0_1 = m(mat, 0, 1);
+  float m_0_2 = m(mat, 0, 2);
+  float m_1_0 = m(mat, 1, 0);
+  float m_1_1 = m(mat, 1, 1);
+  float m_1_2 = m(mat, 1, 2);
+  float m_2_0 = m(mat, 2, 0);
+  float m_2_1 = m(mat, 2, 1);
+  float m_2_2 = m(mat, 2, 2);
+
+  v_out[0] = m_0_0 * a + m_0_1 * b + m_0_2 * c;
+  v_out[1] = m_1_0 * a + m_1_1 * b + m_1_2 * c;
+  v_out[2] = m_2_0 * a + m_2_1 * b + m_2_2 * c;
+}
+
+static inline void babl_matrix_mul_vectorff_buf3 (const float *mat, const float *v_in, float *v_out,
+                                                  int samples)
+{
+  int i;
+  float m_0_0 = m(mat, 0, 0);
+  float m_0_1 = m(mat, 0, 1);
+  float m_0_2 = m(mat, 0, 2);
+  float m_1_0 = m(mat, 1, 0);
+  float m_1_1 = m(mat, 1, 1);
+  float m_1_2 = m(mat, 1, 2);
+  float m_2_0 = m(mat, 2, 0);
+  float m_2_1 = m(mat, 2, 1);
+  float m_2_2 = m(mat, 2, 2);
+  for (i = 0; i < samples; i ++)
+  {
+    float a = v_in[0], b = v_in[1], c = v_in[2];
+
+    v_out[0] = m_0_0 * a + m_0_1 * b + m_0_2 * c;
+    v_out[1] = m_1_0 * a + m_1_1 * b + m_1_2 * c;
+    v_out[2] = m_2_0 * a + m_2_1 * b + m_2_2 * c;
+    v_in += 3;
+    v_out += 3;
+  }
+}
+
+static inline void babl_matrix_mul_vectorff_buf4 (const float *mat, const float *v_in, float *v_out,
+                                                  int samples)
+{
+  int i;
+  float m_0_0 = m(mat, 0, 0);
+  float m_0_1 = m(mat, 0, 1);
+  float m_0_2 = m(mat, 0, 2);
+  float m_1_0 = m(mat, 1, 0);
+  float m_1_1 = m(mat, 1, 1);
+  float m_1_2 = m(mat, 1, 2);
+  float m_2_0 = m(mat, 2, 0);
+  float m_2_1 = m(mat, 2, 1);
+  float m_2_2 = m(mat, 2, 2);
+  for (i = 0; i < samples; i ++)
+  {
+    float a = v_in[0], b = v_in[1], c = v_in[2];
+
+    v_out[0] = m_0_0 * a + m_0_1 * b + m_0_2 * c;
+    v_out[1] = m_1_0 * a + m_1_1 * b + m_1_2 * c;
+    v_out[2] = m_2_0 * a + m_2_1 * b + m_2_2 * c;
+    v_out[3] = v_in[3];
+    v_in += 4;
+    v_out += 4;
+  }
+}
 
-  v_out[0] = m(mat, 0, 0) * val[0] + m(mat, 0, 1) * val[1] + m(mat, 0, 2) * val[2];
-  v_out[1] = m(mat, 1, 0) * val[0] + m(mat, 1, 1) * val[1] + m(mat, 1, 2) * val[2];
-  v_out[2] = m(mat, 2, 0) * val[0] + m(mat, 2, 1) * val[1] + m(mat, 2, 2) * val[2];
+static inline void babl_matrix_mul_vector_buf4 (const double *mat, const double *v_in, double *v_out,
+                                                int samples)
+{
+  int i;
+  double m_0_0 = m(mat, 0, 0);
+  double m_0_1 = m(mat, 0, 1);
+  double m_0_2 = m(mat, 0, 2);
+  double m_1_0 = m(mat, 1, 0);
+  double m_1_1 = m(mat, 1, 1);
+  double m_1_2 = m(mat, 1, 2);
+  double m_2_0 = m(mat, 2, 0);
+  double m_2_1 = m(mat, 2, 1);
+  double m_2_2 = m(mat, 2, 2);
+  for (i = 0; i < samples; i ++)
+  {
+    double a = v_in[0], b = v_in[1], c = v_in[2];
+
+    v_out[0] = m_0_0 * a + m_0_1 * b + m_0_2 * c;
+    v_out[1] = m_1_0 * a + m_1_1 * b + m_1_2 * c;
+    v_out[2] = m_2_0 * a + m_2_1 * b + m_2_2 * c;
+    v_out[3] = v_in[3];
+    v_in += 4;
+    v_out += 4;
+  }
 }
 
 
-- 
2.30.2